import pandas as pd
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
# データ準備
df = pd.read_csv('bike_share.csv')
df
| datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0000 | 3 | 13 | 16 |
| 1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 8 | 32 | 40 |
| 2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0000 | 5 | 27 | 32 |
| 3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 3 | 10 | 13 |
| 4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0000 | 0 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10881 | 2012-12-19 19:00:00 | 4 | 0 | 1 | 1 | 15.58 | 19.695 | 50 | 26.0027 | 7 | 329 | 336 |
| 10882 | 2012-12-19 20:00:00 | 4 | 0 | 1 | 1 | 14.76 | 17.425 | 57 | 15.0013 | 10 | 231 | 241 |
| 10883 | 2012-12-19 21:00:00 | 4 | 0 | 1 | 1 | 13.94 | 15.910 | 61 | 15.0013 | 4 | 164 | 168 |
| 10884 | 2012-12-19 22:00:00 | 4 | 0 | 1 | 1 | 13.94 | 17.425 | 61 | 6.0032 | 12 | 117 | 129 |
| 10885 | 2012-12-19 23:00:00 | 4 | 0 | 1 | 1 | 13.12 | 16.665 | 66 | 8.9981 | 4 | 84 | 88 |
10886 rows × 12 columns
カラムの説明
datetime - hourly date + timestamp
season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
holiday - whether the day is considered a holiday
workingday - whether the day is neither a weekend nor holiday
weather -
1: Clear, Few clouds, Partly cloudy, Partly cloudy
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals
# 日付カラムの確認
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null object 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: float64(3), int64(8), object(1) memory usage: 1020.7+ KB
# datetimeカラムを日付のデータタイプに変換
df['datetime'] = pd.to_datetime(df['datetime'])
# datetimeカラムのDtypeがdatetimeになっていることを確認
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null datetime64[ns] 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: datetime64[ns](1), float64(3), int64(8) memory usage: 1020.7 KB
# 日付データからそれぞれの値を取得
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['dayofyear'] = df['datetime'].dt.dayofyear
df['dayofweek'] = df['datetime'].dt.dayofweek
df['weekofyear'] = df['datetime'].dt.isocalendar().week
df['quarter'] = df['datetime'].dt.quarter
df['isleap'] = df['datetime'].dt.is_leap_year
# データロード
df = pd.read_csv('penguins_size.csv')
df.dropna(inplace=True)
# 多項式特徴量作成
poly = PolynomialFeatures(degree=2, include_bias=False)
df[['culmen_length_mm', 'culmen_depth_mm','culmen_length_mm^2', 'culmen_length_mm * culmen_depth_mm', 'culmen_depth_mm^2' ]] = poly.fit_transform(df[['culmen_length_mm', 'culmen_depth_mm']])
sns.pairplot(df, hue='species')
<seaborn.axisgrid.PairGrid at 0x405fdc94f0>
df['body_mass_g_bin'] = pd.cut(df['body_mass_g'], bins=10, labels=False)
# カテゴリカル変数として扱う場合dtypeをobjectに変換.数値変数として扱ってもよい
df['body_mass_g_bin'] = df['body_mass_g_bin'].astype('object')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 334 entries, 0 to 343 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 species 334 non-null object 1 island 334 non-null object 2 culmen_length_mm 334 non-null float64 3 culmen_depth_mm 334 non-null float64 4 flipper_length_mm 334 non-null float64 5 body_mass_g 334 non-null float64 6 sex 334 non-null object 7 culmen_length_mm^2 334 non-null float64 8 culmen_length_mm * culmen_depth_mm 334 non-null float64 9 culmen_depth_mm^2 334 non-null float64 10 body_mass_g_bin 334 non-null object dtypes: float64(7), object(4) memory usage: 31.3+ KB
df['culmen_diff'] = df['culmen_length_mm'] - df['culmen_depth_mm']
df['culmen_ratio'] = df['culmen_length_mm'] / df['culmen_depth_mm']
%matplotlib inline
sns.pairplot(df, hue='species')
<seaborn.axisgrid.PairGrid at 0x40625cb250>
# データロード
df_app = pd.read_csv('homecredit_application.csv')
df_prev_app = pd.read_csv('homecredit_previous_application.csv')
# 今回のcredit申請データ
df_app.head(3)
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 rows × 122 columns
# 以前のcredit申請データ
df_prev_app.head(3)
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | ... | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1891648 | 100373 | Consumer loans | 10872.990 | 115695.0 | 115119.0 | 11569.5 | 115695.0 | SATURDAY | 22 | ... | Industry | 12.0 | low_normal | POS household with interest | 365243.0 | -1013.0 | -683.0 | -713.0 | -709.0 | 0.0 |
| 1 | 2670402 | 100077 | Cash loans | NaN | 0.0 | 0.0 | NaN | NaN | WEDNESDAY | 14 | ... | XNA | NaN | XNA | Cash | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 1514511 | 101011 | Consumer loans | 6856.065 | 65866.5 | 66735.0 | 6588.0 | 65866.5 | FRIDAY | 21 | ... | Consumer electronics | 12.0 | middle | POS household with interest | 365243.0 | -596.0 | -266.0 | -296.0 | -289.0 | 0.0 |
3 rows × 37 columns
df_prev_app.sort_values('SK_ID_CURR').head()
| SK_ID_PREV | SK_ID_CURR | NAME_CONTRACT_TYPE | AMT_ANNUITY | AMT_APPLICATION | AMT_CREDIT | AMT_DOWN_PAYMENT | AMT_GOODS_PRICE | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | ... | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2547 | 1038818 | 100002 | Consumer loans | 9251.775 | 179055.0 | 179055.0 | 0.0 | 179055.0 | SATURDAY | 9 | ... | Auto technology | 24.0 | low_normal | POS other with interest | 365243.0 | -565.0 | 125.0 | -25.0 | -17.0 | 0.0 |
| 3507 | 2396755 | 100003 | Consumer loans | 6737.310 | 68809.5 | 68053.5 | 6885.0 | 68809.5 | SATURDAY | 15 | ... | Consumer electronics | 12.0 | middle | POS household with interest | 365243.0 | -2310.0 | -1980.0 | -1980.0 | -1976.0 | 1.0 |
| 1654 | 1810518 | 100003 | Cash loans | 98356.995 | 900000.0 | 1035882.0 | NaN | 900000.0 | FRIDAY | 12 | ... | XNA | 12.0 | low_normal | Cash X-Sell: low | 365243.0 | -716.0 | -386.0 | -536.0 | -527.0 | 1.0 |
| 2934 | 2636178 | 100003 | Consumer loans | 64567.665 | 337500.0 | 348637.5 | 0.0 | 337500.0 | SUNDAY | 17 | ... | Furniture | 6.0 | middle | POS industry with interest | 365243.0 | -797.0 | -647.0 | -647.0 | -639.0 | 0.0 |
| 2669 | 1564014 | 100004 | Consumer loans | 5357.250 | 24282.0 | 20106.0 | 4860.0 | 24282.0 | FRIDAY | 5 | ... | Connectivity | 4.0 | middle | POS mobile without interest | 365243.0 | -784.0 | -694.0 | -724.0 | -714.0 | 0.0 |
5 rows × 37 columns
# 集約値計算
credit_prev_sum = df_prev_app.groupby('SK_ID_CURR').sum()[['AMT_CREDIT']]
# SK_ID_CURRをキーに表を結合
df_app.join(credit_prev_sum, on='SK_ID_CURR', rsuffix='_PREV_SUM')
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | AMT_CREDIT_PREV_SUM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 179055.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1452573.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20106.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 2625259.5 |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 999832.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 101152 | 0 | Cash loans | F | N | N | 0 | 112500.0 | 495985.5 | 17946.0 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 | 2476305.0 |
| 996 | 101153 | 0 | Cash loans | F | N | Y | 0 | 225000.0 | 1113840.0 | 57001.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 41850.0 |
| 997 | 101154 | 0 | Cash loans | F | Y | Y | 0 | 144000.0 | 517536.0 | 28206.0 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 478867.5 |
| 998 | 101155 | 0 | Cash loans | M | N | Y | 0 | 315000.0 | 1288350.0 | 37800.0 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 269901.0 |
| 999 | 101156 | 0 | Cash loans | M | Y | Y | 2 | 180000.0 | 679500.0 | 27076.5 | ... | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 2115810.0 |
1000 rows × 123 columns